!pip install wordcloud
!pip install pandas
import pandas as pd
# Load the processed happy-moments table; the first CSV row is the header.
df = pd.read_csv('processed_moments.csv', header=0)
df.shape
df.head(20)
# Counting the sentences per happy moment shows that the frequency decreases
# as the number of sentences in a moment increases.
df.num_sentence.value_counts()
# ground_truth_category shows that affection is the most frequent category in
# people's happy moments, followed by achievement.
df.ground_truth_category.value_counts()
import wordcloud
# Concatenate every cleaned happy moment into one corpus string for the word
# clouds. Missing cells are dropped and the rest cast to str, because
# str.join raises TypeError as soon as it meets a non-string value (a CSV
# round-trip turns empty cells into float NaN).
text = ' '.join(df.cleaned_hm.dropna().astype(str))
len(text)
import os
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate a word cloud image from the full corpus with default settings.
# The result is bound to `cloud` rather than `wordcloud` so that the
# `wordcloud` module imported earlier is not shadowed by an instance.
cloud = WordCloud().generate(text)

# Display the generated image the matplotlib way.
# Tip: WordCloud(max_font_size=40) produces a cloud with a lower maximum
# font size if the default one is too dominant.
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
# Explicitly render the figure, as the other plots in this file do; without
# it nothing is shown when the code runs outside an inline notebook backend.
plt.show()
!pip install matplotlib
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Word cloud shaped and coloured by an image ('lyw.jpg'), built from the
# corpus string `text` assembled earlier in this file.

# The image is used both as the mask (shape) and as the colour source.
alice_coloring = np.array(Image.open('lyw.jpg'))
stopwords = set(STOPWORDS)
# NOTE(review): "said" looks carried over from the upstream wordcloud example;
# kept so the set of filtered words is unchanged.
stopwords.add("said")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
               stopwords=stopwords, max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)
# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# WordCloud.recolor() mutates `wc` in place and returns it. Passing `wc`
# itself to imshow again after a recolor would therefore draw the recoloured
# cloud in the "default colours" panel as well. Render both variants to
# arrays once, in the right order, and reuse them for every figure.
default_image = wc.to_array()
recolored_image = wc.recolor(color_func=image_colors).to_array()

# Show the three panels twice: at matplotlib's default figure size and then
# enlarged to 18.5 x 10.5 inches for readability.
for figure_size in (None, (18.5, 10.5)):
    fig, axes = plt.subplots(1, 3)
    if figure_size is not None:
        fig.set_size_inches(*figure_size)
    axes[0].imshow(default_image, interpolation="bilinear")
    # we could also give color_func=image_colors directly in the constructor
    axes[1].imshow(recolored_image, interpolation="bilinear")
    axes[2].imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
    for ax in axes:
        ax.set_axis_off()
    plt.show()
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Word cloud shaped and coloured by an image ('33.jpg'), built from the
# corpus string `text` assembled earlier in this file.

# The image is used both as the mask (shape) and as the colour source.
alice_coloring = np.array(Image.open('33.jpg'))
stopwords = set(STOPWORDS)
# NOTE(review): "said" looks carried over from the upstream wordcloud example;
# kept so the set of filtered words is unchanged.
stopwords.add("said")
wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
               stopwords=stopwords, max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)
# create coloring from image
image_colors = ImageColorGenerator(alice_coloring)

# WordCloud.recolor() mutates `wc` in place and returns it. Passing `wc`
# itself to imshow again after a recolor would therefore draw the recoloured
# cloud in the "default colours" panel as well. Render both variants to
# arrays once, in the right order, and reuse them for every figure.
default_image = wc.to_array()
recolored_image = wc.recolor(color_func=image_colors).to_array()

# Show the three panels twice: at matplotlib's default figure size and then
# enlarged to 18.5 x 10.5 inches for readability.
for figure_size in (None, (18.5, 10.5)):
    fig, axes = plt.subplots(1, 3)
    if figure_size is not None:
        fig.set_size_inches(*figure_size)
    axes[0].imshow(default_image, interpolation="bilinear")
    # we could also give color_func=image_colors directly in the constructor
    axes[1].imshow(recolored_image, interpolation="bilinear")
    axes[2].imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
    for ax in axes:
        ax.set_axis_off()
    plt.show()
# Load the demographic table; the first CSV row is the header.
dg = pd.read_csv('demographic.csv', header=0)
dg.head()
df.columns
# The demographic dataset contains wid, age, ... columns.
dg.columns
# Attach the demographic columns to every moment by looking up its wid
# (presumably the worker id) in the demographic table indexed by wid.
demographics_by_wid = dg.set_index('wid')
joined = df.join(demographics_by_wid, on='wid')
joined.head()
def is_float(s):
    """Return True if *s* can be converted to a float, False otherwise.

    Accepts anything ``float()`` accepts (numeric strings, ints, floats).
    Values that ``float()`` rejects return False, whether the rejection is a
    ValueError (e.g. ``"abc"``) or a TypeError (e.g. ``None`` or a list).
    Note that NaN and infinities are convertible and therefore return True.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
# Keep only the rows whose age value can be parsed as a number, then store
# age as float. The explicit .copy() makes the filtered frame independent of
# the original, so the column assignment below does not operate on a
# slice (which would raise pandas' SettingWithCopyWarning).
joined = joined[joined.age.apply(is_float)].copy()
joined['age'] = joined['age'].astype(float)
# In the author's run the joined dataset has 100344 rows and 16 columns.
joined.shape
# Count the rows per age; most ages are concentrated between 19 and 43.
joined.age.value_counts().sort_index()
# Rows with age 25 or younger: 24191.
joined[joined.age<=25].shape
# Rows with age above 25 and up to 40: 59393.
joined[(joined.age>25) & (joined.age<=40)].shape
# Rows with age above 40: 16667.
joined[joined.age>40].shape
# Number of rows per country.
joined['country'].value_counts()
# Rows from the USA: 78896.
joined[joined['country'] == 'USA'].shape
# Rows from India: 16713.
joined[joined['country'] == 'IND'].shape
# Rows from neither the USA nor India: 4735.
joined[(joined['country'] != 'USA') & (joined['country'] != 'IND')].shape

# Boolean row masks used to split the data into a 3x3 grid:
# three age groups ...
age = [
    joined.age <= 25,
    (joined.age > 25) & (joined.age <= 40),
    joined.age > 40,
]
# ... and three country groups (USA, India, everything else).
country = [
    joined.country == 'USA',
    joined.country == 'IND',
    (joined.country != 'USA') & (joined.country != 'IND'),
]
# Here we use the processed data (the `text` column) to make the plots.
import os
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# 3x3 grid of word clouds built from the `text` column:
# one row per age group, one column per country group.
wordcloud = [[0, 0, 0] for _ in range(3)]
title1 = ['[0,25]', '(25,40]', '(40,100]']
title2 = ['USA', 'IND', 'Others']
f, axe = plt.subplots(3, 3)
f.set_size_inches(20, 20)
# zip() pairs every mask with its label, so masks and titles cannot drift apart.
for i, (age_mask, age_label) in enumerate(zip(age, title1)):
    for k, (country_mask, country_label) in enumerate(zip(country, title2)):
        group = joined[age_mask & country_mask]
        # Use a dedicated name so the module-level `text` corpus is not
        # overwritten by the last subgroup. Missing values are dropped first:
        # astype(str) alone would turn them into the literal word "nan",
        # which would then be counted in the cloud.
        group_text = ' '.join(group['text'].dropna().astype(str))
        # Generate a word cloud image
        wordcloud[i][k] = WordCloud(width=400, height=400,
                                    background_color='white').generate(group_text)
        # Display the generated image the matplotlib way
        axe[i][k].imshow(wordcloud[i][k], interpolation='bilinear')
        axe[i][k].axis("off")
        axe[i][k].set_title('Age:' + age_label + ' Country:' + country_label)
plt.show()
# Attention! These plots are made from the processed data.
# From the plots we can see that for people aged 25 or younger, whether in the USA, India or other countries, their happy moments are most often related to friends.
# For people aged 25-40, the importance of home and love increases.
# For people older than 40, their happy moments are related to family, daughters and sons.
# In addition, there is not much difference between the countries.
import os
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# 3x3 grid of word clouds built from the `cleaned_hm` column:
# one row per age group, one column per country group.
wordcloud = [[0, 0, 0] for _ in range(3)]
title1 = ['[0,25]', '(25,40]', '(40,100]']
title2 = ['USA', 'IND', 'Others']
f, axe = plt.subplots(3, 3)
f.set_size_inches(20, 20)
# zip() pairs every mask with its label, so masks and titles cannot drift apart.
for i, (age_mask, age_label) in enumerate(zip(age, title1)):
    for k, (country_mask, country_label) in enumerate(zip(country, title2)):
        group = joined[age_mask & country_mask]
        # Use a dedicated name so the module-level `text` corpus is not
        # overwritten by the last subgroup. Missing values are dropped first:
        # astype(str) alone would turn them into the literal word "nan",
        # which would then be counted in the cloud.
        group_text = ' '.join(group['cleaned_hm'].dropna().astype(str))
        # Generate a word cloud image
        wordcloud[i][k] = WordCloud(width=400, height=400,
                                    background_color='white').generate(group_text)
        # Display the generated image the matplotlib way
        axe[i][k].imshow(wordcloud[i][k], interpolation='bilinear')
        axe[i][k].axis("off")
        axe[i][k].set_title('Age:' + age_label + ' Country:' + country_label)
plt.show()
# Attention! These plots are made from the original data.
# We can still see from the plots that for people aged 25 or younger, whether in the USA, India or other countries, their happy moments are most often related to friends.
# Compared with the processed data, however, we get less useful information from these plots. For example, for people aged 25 or younger in other countries, the word "got" is the most frequent one.
# Also, in the fourth picture the word "made" is as frequent as "happy", which makes it hard to analyze.
# Also, in the fifth picture the word "day" is as frequent as "happy", which makes it hard to analyze.
# So it is important to preprocess the data.